In [1]:
%load_ext autoreload
%autoreload 2

import networkx as nx
import numpy as np
import geopandas as gpd
import pandas as pd

import scanpy as sc
import voyagerpy as vp

from anndata import AnnData
from matplotlib import pyplot as plt
from matplotlib.pyplot import rc_context
import pathlib
import json

Downloading the data¶

We download the dataset from the 10X website. These are two gzipped tar archives containing the unfiltered gene count matrix and the spatial information

In [2]:
import requests

outs_dir = pathlib.Path('data/visium_10x/outs')
outs_dir.mkdir(parents=True, exist_ok=True)
root_dir = (outs_dir / '..').resolve()

# Download the gene count matrix
tar_path_ob = root_dir / 'visium_ob.tar.gz'
url_reads = "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_raw_feature_bc_matrix.tar.gz"
if not tar_path_ob.exists():
    res = requests.get(url_reads)
    with tar_path_ob.open('wb') as f:
        f.write(res.content)

# Download the spatial information
tar_path_sp =  root_dir / 'visium_ob_spatial.tar.gz'
url_spatial = "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_spatial.tar.gz"
if not tar_path_sp.exists():
    res = requests.get(url_spatial)
    with tar_path_sp.open('wb') as f:
        f.write(res.content)

# Decompress the downloaded files
!tar -xvf $tar_path_ob -C $outs_dir 
!tar -xvf $tar_path_sp -C $outs_dir 
x raw_feature_bc_matrix/
x raw_feature_bc_matrix/features.tsv.gz
x raw_feature_bc_matrix/barcodes.tsv.gz
x raw_feature_bc_matrix/matrix.mtx.gz
x spatial/
x spatial/spatial_enrichment.csv
x spatial/tissue_lowres_image.png
x spatial/tissue_positions.csv
x spatial/detected_tissue_image.jpg
x spatial/aligned_fiducials.jpg
x spatial/scalefactors_json.json
x spatial/tissue_hires_image.png

This is what the layout of the files look like. The outputs in the spatial directory is explained here on the 10X website.

In [3]:
def print_dir_content(p, d=0):
    print(' '*(d*2) + p.name, end='/\n' if p.is_dir() else '\n')
    if p.is_dir():
        for sub in sorted(p.iterdir()):
            print_dir_content(sub, d+1)

has_tree = !command -v tree
if has_tree:
    !tree $outs_dir
else:
    print_dir_content(outs_dir)
data/visium_10x/outs
├── raw_feature_bc_matrix
│   ├── barcodes.tsv.gz
│   ├── features.tsv.gz
│   └── matrix.mtx.gz
└── spatial
    ├── aligned_fiducials.jpg
    ├── detected_tissue_image.jpg
    ├── scalefactors_json.json
    ├── spatial_enrichment.csv
    ├── tissue_hires_image.png
    ├── tissue_lowres_image.png
    ├── tissue_positions.csv
    └── tissue_positions_list.csv

3 directories, 11 files

The tissue_hires_image.png file is a relatively high resolution image of the tissue, but not full resolution. The tissue_lowres_image.png file is a low resolution image of the tissue, suitable for quick plotting.

In [4]:
im = plt.imread(outs_dir/'spatial' / 'tissue_hires_image.png')
fig, ax = plt.subplots()
ax.imshow(im, origin='lower')
plt.show()
In [5]:
json.load((outs_dir / 'spatial' / 'scalefactors_json.json').open())
Out[5]:
{'tissue_hires_scalef': 0.2,
 'tissue_lowres_scalef': 0.06,
 'fiducial_diameter_fullres': 118.91545,
 'spot_diameter_fullres': 73.61433}
In [6]:
pd.read_csv(outs_dir / 'spatial' / 'tissue_positions.csv').head()
Out[6]:
barcode in_tissue array_row array_col pxl_row_in_fullres pxl_col_in_fullres
0 ACGCCTGACACGCGCT-1 0 0 0 8668 1102
1 TACCGATCCAACACTT-1 0 1 1 8611 1200
2 ATTAAAGCGGACGAGC-1 0 0 2 8554 1102
3 GATAAGGGACGATTAG-1 0 1 3 8498 1200
4 GTGCAAATCACCAATA-1 0 0 4 8441 1102
In [7]:
pd.read_csv(outs_dir / 'spatial' / 'spatial_enrichment.csv').head()
Out[7]:
Feature ID Feature Name Feature Type I P value Adjusted p value Feature Counts in Spots Under Tissue Median Normalized Average Counts Barcodes Detected per Feature
0 ENSMUSG00000001023 S100a5 Gene Expression 0.770905 0.0 0.0 9019 15.848669 1021
1 ENSMUSG00000019874 Fabp7 Gene Expression 0.698735 0.0 0.0 13462 20.679932 1170
2 ENSMUSG00000002985 Apoe Gene Expression 0.694521 0.0 0.0 67509 76.635169 1184
3 ENSMUSG00000025739 Gng13 Gene Expression 0.658575 0.0 0.0 5260 8.803694 1050
4 ENSMUSG00000090223 Pcp4 Gene Expression 0.631703 0.0 0.0 45118 25.811125 1133

Here we read the Space Ranger output as an AnnData object

In [8]:
adata = vp.read_10x_visium(
    outs_dir,
    datatype = 'mtx',
    raw = True,
    prefix = None,
    symbol_as_index=False,
    dtype=np.float64
)
In [9]:
_ = vp.plt.imshow(adata)
In [10]:
fig, axs = plt.subplots(2, 2, figsize=(10,10))

vp.plt.imshow(adata, ax=axs[0,0], title='Original')

vp.spatial.rotate_img90(adata, k=-1, apply=False)
vp.plt.imshow(adata, tmp=True, ax=axs[0,1], title='Rotated 90° clockwise - not applied')

vp.spatial.mirror_img(adata, axis=1, apply=False)
vp.plt.imshow(adata, tmp=True, ax=axs[1, 0], title='Mirror rows - not applied')

vp.spatial.apply_transforms(adata)
_ = vp.plt.imshow(adata, ax=axs[1,1], title='Transformed image - changes applied')

Quality Control (QC)¶

In [11]:
if adata.uns['config']['var_names'] == 'symbol':
    is_mt = adata.var.index.to_series().str.contains('^mt-').values
else:
    is_mt = adata.var['symbol'].str.contains('^mt-').values


adata = vp.utl.calculate_metrics(adata)
vp.utils.add_per_cell_qcmetrics(adata, subsets={'mito': is_mt})
adata = vp.spatial.get_geom(adata)

_ = vp.plt.plot_spatial_feature(
    adata,
    ['sum', 'detected', 'subsets_mito_percent'],
    tissue=False,
    ncol=2
)
In [12]:
adata.obs['in_tissue'] = adata.obs['in_tissue'].astype('category')
axs = vp.plt.plot_barcode_data(
    adata, 
    y=['sum', 'detected', 'subsets_mito_percent'],
    x='in_tissue',
    ncol=3,
    figsize=(8, 4),
    cmap='tab10'
)
In [13]:
_ = vp.plt.plot_barcode_data(
    adata, 
    x='sum', 
    y='subsets_mito_percent', 
    color_by='in_tissue', 
    cmap='tab10'
)
In [14]:
adata_tissue = adata[adata.obs["in_tissue"]==1]
adata_tissue = vp.spatial.get_geom(adata_tissue)
In [15]:
_ = vp.plotting.plot_barcodes_bin2d(
    adata_tissue, 
    x='sum', 
    y='detected',
    bins=76,
    cmin=1
)

In order to preserve the counts and the log-normalized counts, we save them as layers. This is because we will normalize adata_tissue.X before we perform PCA in this notebook.

In [16]:
adata_tissue.layers['counts'] = adata_tissue.X.copy()
vp.utils.log_norm_counts(adata_tissue, inplace=True)
adata_tissue.layers['logcounts'] = adata_tissue.X.copy()

In order to compare this notebook to the corresponding R vignette, we use the top highly variable genes acquired from scran's modelGeneVar and getTopHVGs.

In [17]:
use_R_generated_hvgs = True
hvgs_file = (root_dir / 'hvgs.txt')

if use_R_generated_hvgs and hvgs_file.exists():
    with hvgs_file.open() as f:
        hvgs = list(map(str.strip, f.readlines()))
        adata_tissue.var['highly_variable'] = False

        if adata_tissue.uns['config']['var_names'] == 'gene_ids':
            adata_tissue.var.loc[hvgs, 'highly_variable'] = True
        else:
            adata_tissue.var['highly_variable'] = adata_tissue.var['gene_ids'].isin(hvgs)
else:
    sc.pp.highly_variable_genes(adata_tissue, flavour='cell_ranger', n_top_genes=2000)

Dimension reduction and clustering¶

In the companion vignette, the data is scaled prior to computing the PCA. Thus, we follow suit.

In [18]:
# scale first, then pca

use_sc = False
if use_sc:
    # Use scanpy scaling
    sc.pp.scale(adata_tissue, zero_center=True)
else:
    # Use voyagerpy scaling
    adata_tissue.X = vp.utils.scale(adata_tissue.X, center=True)

sc.tl.pca(adata_tissue, use_highly_variable=True, n_comps=30, random_state=1337)
In [19]:
_ = vp.plt.elbow_plot(adata_tissue, ndims=30)
In [20]:
ax = vp.plt.plot_dim_loadings(adata_tissue, range(5), ncol=2, show_symbol=True)

Cluster the reduced dimension of the barcodes.

In [21]:
sc.pp.neighbors(adata_tissue, n_pcs=3, use_rep='X_pca')
sc.tl.leiden(
    adata_tissue, 
    random_state=29, 
    resolution=0.3, 
    key_added='cluster',
)
# sc.tl.umap(adata_tissue)
In [22]:
ax = vp.plt.plot_pca(adata_tissue, figsize=(7,7), ndim=5, colorby='cluster', cmap='tab10')
In [23]:
_ = vp.plt.plot_spatial_feature(adata_tissue, 'cluster', cmap='dittoseq', barcode_geom='spot_poly')
In [24]:
pl = vp.plt.spatial_reduced_dim(
    adata_tissue, 
    "X_pca", 
    ncomponents=5, 
    barcode_geom="spot_poly", ncol=2, divergent=True, div_cmap='roma')
In [25]:
genes_use = ['Gm42418', 'Syt1', 'Pcp4', 'Ly6g6e', 'mt-Nd1','Aqp1', 'Ecrg4']
_ = vp.plt.plot_expression(
    adata_tissue, 
    genes_use, 
    groupby='cluster', 
    show_symbol=True, 
    layer='logcounts'
)
In [26]:
_ = vp.plt.plot_spatial_feature(
    adata_tissue,
    genes_use, 
    barcode_geom = "spot_poly", 
    ncol = 2,
    layer='logcounts',
)